notebook.community

Edit and run



In [2]:

    
# Decision Tree Classifier
from sklearn import datasets
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
import pandas as pd



In [3]:

    
# Load the iris dataset bundled with scikit-learn.
# For background on this dataset, refer to the logistic_regression script.
# The returned object carries the attributes used in the following cells:
# `data`, `target` and `feature_names`.
dataset = datasets.load_iris()



In [4]:

    
# Let us now build a pandas DataFrame hosting the data at hand.

# We first need the list of feature names to use as column labels.
# It is already stored on the dataset object, so we simply reuse it.
lfeat = dataset.feature_names



In [5]:

    
# Assemble the DataFrame: the raw measurements become the cell values
# and the feature names become the column labels (both passed by keyword).
df_iris = pd.DataFrame(data=dataset.data, columns=lfeat)



In [7]:

    
# Show the first 5 samples (rows) for all features (columns).
# print() called with a single string behaves the same on Python 2 and 3,
# whereas the bare `print "..."` statement is a SyntaxError on Python 3.
print("Printing data up to the 5th sample")
df_iris.iloc[:5, :]  # last expression of the cell -> rendered as a table









    



Printing data up to the 5th sample






    Out[7]:






  
    
      
      sepal length (cm)
      sepal width (cm)
      petal length (cm)
      petal width (cm)
    
  
  
    
      0
      5.1
      3.5
      1.4
      0.2
    
    
      1
      4.9
      3.0
      1.4
      0.2
    
    
      2
      4.7
      3.2
      1.3
      0.2
    
    
      3
      4.6
      3.1
      1.5
      0.2
    
    
      4
      5.0
      3.6
      1.4
      0.2



In [8]:

    
# We also want to add the target, i.e. the class label of each sample
# (this is a classification problem, so the target is a class index,
# not a regression value).
# Let's create a new column :
df_iris["Species"] = dataset.target # Must have one entry per row of the dataframe



In [9]:

    
# Let's review our complete dataframe:
# print("") emits a blank separator line on both Python 2 and Python 3
# (a bare `print` is a silent no-op expression on Python 3).
print("")
print("Printing data up to the 5th sample")
print("Also print the target")
df_iris.iloc[:5, :]  # first 5 samples for all features, including the target









    



Printing data up to the 5th sample
Also print the target






    Out[9]:






  
    
      
      sepal length (cm)
      sepal width (cm)
      petal length (cm)
      petal width (cm)
      Species
    
  
  
    
      0
      5.1
      3.5
      1.4
      0.2
      0
    
    
      1
      4.9
      3.0
      1.4
      0.2
      0
    
    
      2
      4.7
      3.2
      1.3
      0.2
      0
    
    
      3
      4.6
      3.1
      1.5
      0.2
      0
    
    
      4
      5.0
      3.6
      1.4
      0.2
      0



In [10]:

    
# We are now going to fit a Decision Tree model to the data.

# Let's use an example to understand what decision trees do.
# Picture a doctor and a sick patient.
# The doctor follows a protocol to find out what ails the patient,
# asking questions such as: how old are you, where does it hurt, and so on.
# Each answer narrows down the options until the problem is
# eventually identified.

# Decision trees proceed in the same way:
# they make a series of splits in the feature space,
# e.g. if feat1 > c => classify as class 1.
# The feature used for each split and its threshold value are learnt
# from the training data by optimising a criterion at each split
# (the fitted model printed further below reports criterion='gini').

# As before, we create an instance of the model
model = DecisionTreeClassifier()



In [11]:

    
# Fit the model to the training data (X, Y).
# The DataFrame holds features and target side by side, so we first pull
# them apart into two plain arrays:
#   - `data`   : the feature columns (X)
#   - `target` : the "Species" column (Y)

data = df_iris.loc[:, lfeat].values
target = df_iris.loc[:, "Species"].values

# fit() returns the estimator itself, so we can print it straight away
# to display the hyper-parameters in use.
print(model.fit(data, target))









    



DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')



In [12]:

    
# Make predictions.
# As before, we can use the model to make predictions on any data.
# NOTE: `data` is the very same array the model was fitted on, so the
# scores printed below are training scores; they do not tell us how well
# the tree would classify samples it has never seen.
expected = target
predicted = model.predict(data)
# and evaluate the performance of the classification with standard metrics
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))









    



             precision    recall  f1-score   support

          0       1.00      1.00      1.00        50
          1       1.00      1.00      1.00        50
          2       1.00      1.00      1.00        50

avg / total       1.00      1.00      1.00       150

[[50  0  0]
 [ 0 50  0]
 [ 0  0 50]]



In [ ]:

    
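# This time we can see that we got a perfect prediction - again.
# Keep in mind that the model was evaluated on the very data it was trained on:
# a fully grown decision tree can memorise its training set, so a perfect score
# here says nothing about performance on unseen samples (use a held-out test set
# or cross-validation for that).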

Decision surface of a decision tree using paired features

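An example of plotting the decision surface of a decision tree trained on each pair of features of the iris dataset, adapted from the scikit-learn example gallery: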

http://scikit-learn.org/stable/auto_examples/tree/plot_iris.html



In [19]:

    
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

# Parameters
n_classes = 3
plot_colors = "bry"
plot_step = 0.02

# Load data
iris = load_iris()

for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3],
                                [1, 2], [1, 3], [2, 3]]):
    # We only take the two corresponding features
    X = iris.data[:, pair]
    y = iris.target

    # Shuffle
    idx = np.arange(X.shape[0])
    np.random.seed(13)
    np.random.shuffle(idx)
    X = X[idx]
    y = y[idx]

    # Standardize
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mean) / std

    # Train
    clf = DecisionTreeClassifier().fit(X, y)

    # Plot the decision boundary
    plt.subplot(2, 3, pairidx + 1)

    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)

    plt.xlabel(iris.feature_names[pair[0]])
    plt.ylabel(iris.feature_names[pair[1]])
    plt.axis("tight")

    # Plot the training points
    for i, color in zip(range(n_classes), plot_colors):
        idx = np.where(y == i)
        plt.scatter(X[idx, 0], X[idx, 1], c=color, label=iris.target_names[i],
                    cmap=plt.cm.Paired)

    plt.axis("tight")

plt.suptitle("Decision surface of a decision tree using paired features")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()



In [ ]:

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2